#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
#sns.set_theme()
from pprint import pprint
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import dendrogram, linkage,cophenet
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, InterclusterDistance
from sklearn.decomposition import PCA
#Setting styles for sns
sns.set_theme(context="notebook", style="darkgrid",font_scale=1.2, rc={"lines.linewidth": 2.5})
#sns.set_style('darkgrid')
# Graphing functions
#combined plot for box and hist
def histogram_boxplot(feature, figsize=(15, 10), bins=None):
    """Boxplot and histogram of `feature` stacked in a single figure.

    Parameters
    ----------
    feature : 1-d feature array (Series/array-like)
    figsize : size of the figure (default (15, 10))
    bins : number of histogram bins (default None -> auto binning with a KDE overlay)
    """
    # Two stacked axes sharing the x-axis: slim boxplot on top (25% height),
    # histogram below (75% height).
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,
        sharex=True,
        gridspec_kw={"height_ratios": (.25, .75)},
        figsize=figsize,
    )
    # Boxplot with the mean marked (showmeans draws a marker at the mean value)
    sns.boxplot(x=feature, ax=ax_box2, showmeans=True, color='violet')
    # Honour an explicit bin count, otherwise auto-bin and overlay a KDE.
    # (Fixes: docstring previously claimed a (9,8) default figsize; the original
    # used a conditional *expression* purely for its side effect; the `palette`
    # argument was dropped because seaborn ignores it when no `hue` is given.)
    if bins:
        sns.histplot(feature, kde=False, ax=ax_hist2, bins=bins)
    else:
        sns.histplot(feature, kde=True, ax=ax_hist2)
    ax_hist2.axvline(np.mean(feature), color='green', linestyle='--')  # mean
    ax_hist2.axvline(np.median(feature), color='black', linestyle='-')  # median
def show_values_on_bars(axs, decimals=False):
    """Annotate every bar in `axs` with its height.

    `axs` may be a single matplotlib Axes or an ndarray of Axes (from subplots).
    With decimals=True the value is shown with two decimal places, otherwise
    as an integer.
    """
    fmt = '{:.2f}' if decimals else '{:.0f}'

    def _annotate(ax):
        # Write the formatted bar height centred at the top of each patch.
        for bar in ax.patches:
            x_mid = bar.get_x() + bar.get_width() / 2
            y_top = bar.get_y() + bar.get_height()
            ax.text(x_mid, y_top, fmt.format(bar.get_height()), ha="center")

    if isinstance(axs, np.ndarray):
        for _, single_ax in np.ndenumerate(axs):
            _annotate(single_ax)
    else:
        _annotate(axs)
#Modelling clusters in 2d space
def showClustersIn2D(model, df, title='Cluster Plot', ax=None, centroid_size=40, point_size=15):
    '''Fit `model` on `df` and plot the points in 2D, using PCA to project the
    n-dimensional dataset (together with the cluster centroids) onto 2 components.

    Parameters
    ----------
    model : clustering model exposing fit_predict, labels_ and cluster_centers_
    df : DataFrame of (scaled) features
    title : plot title
    ax : optional matplotlib Axes when using subplots
    centroid_size, point_size : marker sizes for centroids and data points
    '''
    model.fit_predict(df)
    centers = model.cluster_centers_
    # Project data and centroids together so both land in the same 2D space
    pca2 = PCA(n_components=2)
    combined = np.append(np.asarray(df), centers, axis=0)
    projected = pca2.fit_transform(combined)
    zdf = pd.DataFrame(projected, columns=['x', 'y'])
    # Bugfix: append one 'Centroids' label per centroid instead of the
    # hard-coded 3, so the plot is correct for any n_clusters.
    labels = np.append(model.labels_, ['Centroids'] * len(centers))
    # Centroids get a larger marker than ordinary points
    sizes = {'Centroids': centroid_size}
    for lab in model.labels_:
        sizes[str(lab)] = point_size
    zdf['clusters'] = labels
    plt.suptitle(title)
    if ax is None:  # fixed identity comparison (was `ax==None`)
        sns.scatterplot(data=zdf, x=zdf.x, y=zdf.y, hue=zdf.clusters, size=zdf.clusters, sizes=sizes)
    else:
        sns.scatterplot(data=zdf, x=zdf.x, y=zdf.y, hue=zdf.clusters, ax=ax, size=zdf.clusters, sizes=sizes)
#-------------------
# Other Functions and Class
#color highlighting for max vals in df
def highlight_max(data, color='yellow'):
    '''Return CSS strings highlighting the maximum in a Series or DataFrame.

    Intended for DataFrame.style.apply; adapted from
    https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html
    '''
    css = 'background-color: {}'.format(color)
    if data.ndim == 1:
        # Series (style.apply with axis=0 or axis=1): one CSS string per cell
        mask = data.eq(data.max())
        return [css if flag else '' for flag in mask]
    # DataFrame (style.apply with axis=None): highlight the global maximum
    global_max = data.max().max()
    return pd.DataFrame(
        np.where(data.eq(global_max), css, ''),
        index=data.index,
        columns=data.columns,
    )
#Defining a child class to AgglomerativeClustering which has a predict method (needed by yellowbrick visualiser)
class AgglomerativeClusteringWithPredict(AgglomerativeClustering):
    """AgglomerativeClustering plus the predict/cluster_centers_ API that the
    yellowbrick visualisers and the elbow/distortion code in this notebook expect.

    Cluster "centers" are defined as the per-cluster mean of the fitted data.
    """

    # Placeholders so the attributes exist before fit_predict is ever called.
    df = pd.DataFrame()
    cluster_centers_ = []
    labels_ = []

    def __init__(self, n_clusters, affinity, linkage):
        # Bugfix: the original declared two __init__ methods; the first
        # (taking fname/lname) was dead code shadowed by this one and has
        # been removed.
        super().__init__(n_clusters, affinity=affinity, linkage=linkage)

    def predict(self, data):
        # Agglomerative clustering cannot label unseen data, so "predict"
        # simply re-fits on `data`. Bugfix: now returns the labels (the
        # original returned None), matching the sklearn predict contract.
        self.labels_ = super().fit_predict(data)
        return self.labels_

    def makeClusterCenterList(self):
        """Compute cluster_centers_ as the mean feature vector of each cluster."""
        temp = []
        for label in set(self.labels_):
            # Mean of every column for the rows belonging to this cluster
            means = self.df[self.df['AgglomerativeClusteringWithPredictLabels'] == label].mean(axis=0)
            temp.append(list(means[:-1]))  # drop the appended label column itself
        del self.df  # free the working copy once centers are computed
        self.cluster_centers_ = np.array(temp)

    def fit_predict(self, df):
        # Fit, then derive cluster centers from a labelled copy of the data.
        self.labels_ = super().fit_predict(df)
        self.df = df.copy()
        self.df['AgglomerativeClusteringWithPredictLabels'] = self.labels_
        self.makeClusterCenterList()
        return self.labels_
# Reading data
data = pd.read_excel('Credit Card Customer Data.xlsx')
data
| Sl_No | Customer Key | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 87073 | 100000 | 2 | 1 | 1 | 0 |
| 1 | 2 | 38414 | 50000 | 3 | 0 | 10 | 9 |
| 2 | 3 | 17341 | 50000 | 7 | 1 | 3 | 4 |
| 3 | 4 | 40496 | 30000 | 5 | 1 | 1 | 4 |
| 4 | 5 | 47437 | 100000 | 6 | 0 | 12 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 655 | 656 | 51108 | 99000 | 10 | 1 | 10 | 0 |
| 656 | 657 | 60732 | 84000 | 10 | 1 | 13 | 2 |
| 657 | 658 | 53834 | 145000 | 8 | 1 | 9 | 1 |
| 658 | 659 | 80655 | 172000 | 10 | 1 | 15 | 0 |
| 659 | 660 | 80150 | 167000 | 9 | 0 | 12 | 2 |
660 rows × 7 columns
data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Sl_No | 660.0 | 330.500000 | 190.669872 | 1.0 | 165.75 | 330.5 | 495.25 | 660.0 |
| Customer Key | 660.0 | 55141.443939 | 25627.772200 | 11265.0 | 33825.25 | 53874.5 | 77202.50 | 99843.0 |
| Avg_Credit_Limit | 660.0 | 34574.242424 | 37625.487804 | 3000.0 | 10000.00 | 18000.0 | 48000.00 | 200000.0 |
| Total_Credit_Cards | 660.0 | 4.706061 | 2.167835 | 1.0 | 3.00 | 5.0 | 6.00 | 10.0 |
| Total_visits_bank | 660.0 | 2.403030 | 1.631813 | 0.0 | 1.00 | 2.0 | 4.00 | 5.0 |
| Total_visits_online | 660.0 | 2.606061 | 2.935724 | 0.0 | 1.00 | 2.0 | 4.00 | 15.0 |
| Total_calls_made | 660.0 | 3.583333 | 2.865317 | 0.0 | 1.00 | 3.0 | 5.00 | 10.0 |
data.isna().sum()
Sl_No 0 Customer Key 0 Avg_Credit_Limit 0 Total_Credit_Cards 0 Total_visits_bank 0 Total_visits_online 0 Total_calls_made 0 dtype: int64
data.nunique()
Sl_No 660 Customer Key 655 Avg_Credit_Limit 110 Total_Credit_Cards 10 Total_visits_bank 6 Total_visits_online 16 Total_calls_made 11 dtype: int64
categorical_cols = list(data.columns[-4:])
pprint(categorical_cols)
['Total_Credit_Cards', 'Total_visits_bank', 'Total_visits_online', 'Total_calls_made']
# fixing column names to avoid spaces
data.columns = [c.replace(' ','_') for c in data.columns]
#which customer keys are repeated
x=data.Customer_Key.value_counts()
x=x[x>1]
y=x.index.tolist()
print('Customer Keys which have multiple entries in dataset are:')
pprint(y)
Customer Keys which have multiple entries in dataset are: [50706, 37252, 97935, 96929, 47437]
#examining rows in data where customer key is same
for c in y:
display(data[data.Customer_Key==c])
| Sl_No | Customer_Key | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | |
|---|---|---|---|---|---|---|---|
| 411 | 412 | 50706 | 44000 | 4 | 5 | 0 | 2 |
| 541 | 542 | 50706 | 60000 | 7 | 5 | 2 | 2 |
| Sl_No | Customer_Key | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | |
|---|---|---|---|---|---|---|---|
| 48 | 49 | 37252 | 6000 | 4 | 0 | 2 | 8 |
| 432 | 433 | 37252 | 59000 | 6 | 2 | 1 | 2 |
| Sl_No | Customer_Key | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | |
|---|---|---|---|---|---|---|---|
| 104 | 105 | 97935 | 17000 | 2 | 1 | 2 | 10 |
| 632 | 633 | 97935 | 187000 | 7 | 1 | 7 | 0 |
| Sl_No | Customer_Key | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | |
|---|---|---|---|---|---|---|---|
| 391 | 392 | 96929 | 13000 | 4 | 5 | 0 | 0 |
| 398 | 399 | 96929 | 67000 | 6 | 2 | 2 | 2 |
| Sl_No | Customer_Key | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | |
|---|---|---|---|---|---|---|---|
| 4 | 5 | 47437 | 100000 | 6 | 0 | 12 | 3 |
| 332 | 333 | 47437 | 17000 | 7 | 3 | 1 | 0 |
working_df = data.iloc[:,2:] #defining working_df and not including sl_no and customer key as they dont add to our analysis
working_df
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | |
|---|---|---|---|---|---|
| 0 | 100000 | 2 | 1 | 1 | 0 |
| 1 | 50000 | 3 | 0 | 10 | 9 |
| 2 | 50000 | 7 | 1 | 3 | 4 |
| 3 | 30000 | 5 | 1 | 1 | 4 |
| 4 | 100000 | 6 | 0 | 12 | 3 |
| ... | ... | ... | ... | ... | ... |
| 655 | 99000 | 10 | 1 | 10 | 0 |
| 656 | 84000 | 10 | 1 | 13 | 2 |
| 657 | 145000 | 8 | 1 | 9 | 1 |
| 658 | 172000 | 10 | 1 | 15 | 0 |
| 659 | 167000 | 9 | 0 | 12 | 2 |
660 rows × 5 columns
histogram_boxplot(working_df['Avg_Credit_Limit'])
#Interactive plot to hover and check outlier values, median, fences etc
px.box(working_df['Avg_Credit_Limit'], orientation='h', height=200)
#Plot cdf of feature
sns.ecdfplot(data=working_df,x='Avg_Credit_Limit')
<AxesSubplot:xlabel='Avg_Credit_Limit', ylabel='Proportion'>
working_df['Avg_Credit_Limit'].describe().T
count 660.000000 mean 34574.242424 std 37625.487804 min 3000.000000 25% 10000.000000 50% 18000.000000 75% 48000.000000 max 200000.000000 Name: Avg_Credit_Limit, dtype: float64
#all_col = working_df.iloc[:,1:].columns
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Histogram of Categorical (numerical) variables', fontsize=20)
row = 0
for i,column in enumerate(categorical_cols):
zz=sns.countplot(ax=axes[0 if i<2 else 1,i%2],x=working_df[column], palette='crest_r')
show_values_on_bars(zz)
#counter = counter+1
fig.tight_layout(pad=2.0)
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Boxplot of Categorical (numerical) variables', fontsize=20)
row = 0
for i,column in enumerate(categorical_cols):
zz=sns.boxplot(ax=axes[0 if i<2 else 1,i%2],x=working_df[column], palette='Blues')
show_values_on_bars(zz)
#counter = counter+1
fig.tight_layout(pad=2.0)
#Interactive chart to check outliers for total visits online
px.box(working_df['Total_visits_online'], orientation='h', height=200)
#Plotting CDFs
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('CDF of Categorical (numerical) variables', fontsize=20)
row = 0
for i,column in enumerate(categorical_cols):
zz=sns.ecdfplot(ax=axes[0 if i<2 else 1,i%2],x=working_df[column], palette='Blues')
show_values_on_bars(zz)
#counter = counter+1
fig.tight_layout(pad=2.0)
working_df.loc[:,categorical_cols].describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Total_Credit_Cards | 660.0 | 4.706061 | 2.167835 | 1.0 | 3.0 | 5.0 | 6.0 | 10.0 |
| Total_visits_bank | 660.0 | 2.403030 | 1.631813 | 0.0 | 1.0 | 2.0 | 4.0 | 5.0 |
| Total_visits_online | 660.0 | 2.606061 | 2.935724 | 0.0 | 1.0 | 2.0 | 4.0 | 15.0 |
| Total_calls_made | 660.0 | 3.583333 | 2.865317 | 0.0 | 1.0 | 3.0 | 5.0 | 10.0 |
sns.heatmap(working_df.corr(), annot=True, fmt='.2f', cmap='coolwarm')
<AxesSubplot:>
sns.pairplot(working_df, diag_kind='kde')
<seaborn.axisgrid.PairGrid at 0x7f8288c8e850>
#setting random_state
# Fixed seed so KMeans initialisation (and hence cluster labels) is reproducible
random_state = 314159
# Scaling the data set before clustering
# StandardScaler -> zero mean / unit variance per feature; needed because
# Avg_Credit_Limit is orders of magnitude larger than the count features
scaler=StandardScaler()
scaled_data=scaler.fit_transform(working_df)
df_scaled = pd.DataFrame(scaled_data, columns = working_df.columns)
df_scaled
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | |
|---|---|---|---|---|---|
| 0 | 1.740187 | -1.249225 | -0.860451 | -0.547490 | -1.251537 |
| 1 | 0.410293 | -0.787585 | -1.473731 | 2.520519 | 1.891859 |
| 2 | 0.410293 | 1.058973 | -0.860451 | 0.134290 | 0.145528 |
| 3 | -0.121665 | 0.135694 | -0.860451 | -0.547490 | 0.145528 |
| 4 | 1.740187 | 0.597334 | -1.473731 | 3.202298 | -0.203739 |
| ... | ... | ... | ... | ... | ... |
| 655 | 1.713589 | 2.443892 | -0.860451 | 2.520519 | -1.251537 |
| 656 | 1.314621 | 2.443892 | -0.860451 | 3.543188 | -0.553005 |
| 657 | 2.937092 | 1.520613 | -0.860451 | 2.179629 | -0.902271 |
| 658 | 3.655235 | 2.443892 | -0.860451 | 4.224968 | -1.251537 |
| 659 | 3.522245 | 1.982253 | -1.473731 | 3.202298 | -0.553005 |
660 rows × 5 columns
df_scaled.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Avg_Credit_Limit | 660.0 | 8.612639e-17 | 1.000758 | -0.839808 | -0.653623 | -0.440840 | 0.357097 | 4.399975 |
| Total_Credit_Cards | 660.0 | 4.306320e-17 | 1.000758 | -1.710864 | -0.787585 | 0.135694 | 0.597334 | 2.443892 |
| Total_visits_bank | 660.0 | -4.306320e-17 | 1.000758 | -1.473731 | -0.860451 | -0.247170 | 0.979390 | 1.592670 |
| Total_visits_online | 660.0 | 7.536059e-17 | 1.000758 | -0.888380 | -0.547490 | -0.206600 | 0.475180 | 4.224968 |
| Total_calls_made | 660.0 | -8.612639e-17 | 1.000758 | -1.251537 | -0.902271 | -0.203739 | 0.494794 | 2.241125 |
# Per-feature KDEs of the scaled data - all features now on a comparable scale
fig, ax = plt.subplots(1,5, figsize=(20,5))
for i, col in enumerate(df_scaled.columns):
    sns.kdeplot(df_scaled[col], ax=ax[i])
clusters=range(1,9) # Although EDA shows likely 3-6 clusters we explore 1-8 (also needed for visualisations below)
meanDistortions=[]
predictions = {}  # cluster labels per k, reused later when profiling the chosen k
for k in clusters:
    model=KMeans(n_clusters=k, random_state=random_state) #declare model with k clusters
    model.fit(df_scaled) #fit model to scaled data
    prediction=model.predict(df_scaled) #make predictions of clusters based on scaled data
    predictions[k] = prediction
    #Next we compute the distortion ie euclidean distance (variance) within each cluster
    #cdist finds distances between two matrices - we use euclidean which is passed as parameter
    #the output contains the distance of each row from each centroid
    #since we need only within cluster variance, we use np.min to select the minimum for each row which
    #of course is its own cluster centroid
    #this is then divided by no of rows to convert to variance
    distortion=sum(np.min(cdist(df_scaled, model.cluster_centers_, 'euclidean'), axis=1)) / df_scaled.shape[0]
    meanDistortions.append(distortion)
    print(f'Number of Clusters: {k} \t Average Distortion: {distortion:.3f}')
Number of Clusters: 1 Average Distortion: 2.007 Number of Clusters: 2 Average Distortion: 1.457 Number of Clusters: 3 Average Distortion: 1.147 Number of Clusters: 4 Average Distortion: 1.046 Number of Clusters: 5 Average Distortion: 0.991 Number of Clusters: 6 Average Distortion: 0.943 Number of Clusters: 7 Average Distortion: 0.923 Number of Clusters: 8 Average Distortion: 0.889
#Plotting elbow plot
sns.set_context("notebook", font_scale=1.2, rc={"lines.linewidth": 2.5})
sns.lineplot(x=clusters, y=meanDistortions)
plt.xlabel('k')
plt.ylabel('Average Distortion')
plt.title('Selecting k with the Elbow Method', fontsize=20);
#Using alternative yellowbrick elbow visualiser
visualizer = KElbowVisualizer(KMeans(random_state = random_state))
visualizer.fit(df_scaled) # Fit the data to the visualizer
visualizer.show();
# Computing Silhouette scores for k = 2..8
# (score is in [-1, 1]; higher means better-separated, more cohesive clusters)
sil_score = []
cluster_list = list(range(2,9))
for n_clusters in cluster_list:
    clusterer = KMeans(n_clusters=n_clusters, random_state=random_state)
    preds = clusterer.fit_predict(df_scaled)
    score = silhouette_score(df_scaled, preds, random_state=random_state)
    sil_score.append(score)
    # Bugfix: removed the stray ')' the original format string printed at the
    # end of every line ("... silhouette score is 0.418)")
    print(f"For n_clusters = {n_clusters}, silhouette score is {score:.3f}")
For n_clusters = 2, silhouette score is 0.418) For n_clusters = 3, silhouette score is 0.516) For n_clusters = 4, silhouette score is 0.356) For n_clusters = 5, silhouette score is 0.273) For n_clusters = 6, silhouette score is 0.255) For n_clusters = 7, silhouette score is 0.235) For n_clusters = 8, silhouette score is 0.227)
zz=sns.barplot(x=cluster_list,y=sil_score, palette='crest')
show_values_on_bars(zz, decimals=True)
plt.xlabel('k')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Scores for k Clusters (KMeans)', fontsize=20);
# Finding optimal no. of clusters with silhouette coefficients
visualizer = SilhouetteVisualizer(KMeans(3, random_state = random_state))
visualizer.fit(df_scaled)
visualizer.show();
# Finding optimal no. of clusters with silhouette coefficients
visualizer = SilhouetteVisualizer(KMeans(4, random_state = random_state))
visualizer.fit(df_scaled)
visualizer.show();
# Finding optimal no. of clusters with silhouette coefficients
visualizer = SilhouetteVisualizer(KMeans(5, random_state = random_state))
visualizer.fit(df_scaled)
visualizer.show();
# Finding optimal no. of clusters with silhouette coefficients
visualizer = SilhouetteVisualizer(KMeans(6, random_state = random_state))
visualizer.fit(df_scaled)
visualizer.show();
#Using yellowbrick visualiser to check cluster dispersion
'''from the docs: Intercluster distance maps display an embedding of the cluster centers in 2 dimensions with the distance to other centers preserved. E.g. the closer to centers are in the visualization, the closer they are in the original feature space. The clusters are sized according to a scoring metric. By default, they are sized by membership, e.g. the number of instances that belong to each center. This gives a sense of the relative importance of clusters. Note however, that because two clusters overlap in the 2D space, it does not imply that they overlap in the original feature space.'''
visualizer = InterclusterDistance(KMeans(3, random_state = random_state),legend_loc='upper left')
visualizer.fit(df_scaled) # Fit the data to the visualizer
visualizer.show();
#Plotting 2d projection of clusters - the custom function uses PCA to project 5 feature space onto 2D
model = KMeans(n_clusters=3, random_state=random_state)
showClustersIn2D(model,df_scaled, ax=None, title='Kmeans Clusters and Centroids')
#Adding a column to working_df with the predictions from our kmeans model where n_clusters = 3
working_df = working_df.assign(kmeans_clusters = predictions[3])
working_df
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | kmeans_clusters | |
|---|---|---|---|---|---|---|
| 0 | 100000 | 2 | 1 | 1 | 0 | 1 |
| 1 | 50000 | 3 | 0 | 10 | 9 | 0 |
| 2 | 50000 | 7 | 1 | 3 | 4 | 1 |
| 3 | 30000 | 5 | 1 | 1 | 4 | 1 |
| 4 | 100000 | 6 | 0 | 12 | 3 | 2 |
| ... | ... | ... | ... | ... | ... | ... |
| 655 | 99000 | 10 | 1 | 10 | 0 | 2 |
| 656 | 84000 | 10 | 1 | 13 | 2 | 2 |
| 657 | 145000 | 8 | 1 | 9 | 1 | 2 |
| 658 | 172000 | 10 | 1 | 15 | 0 | 2 |
| 659 | 167000 | 9 | 0 | 12 | 2 | 2 |
660 rows × 6 columns
x=working_df.groupby('kmeans_clusters').mean().iloc[:,:5]
x['no_of_rows']=working_df.groupby('kmeans_clusters')['Avg_Credit_Limit'].count().values
x
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | no_of_rows | |
|---|---|---|---|---|---|---|
| kmeans_clusters | ||||||
| 0 | 12174.107143 | 2.410714 | 0.933036 | 3.553571 | 6.870536 | 224 |
| 1 | 33782.383420 | 5.515544 | 3.489637 | 0.981865 | 2.000000 | 386 |
| 2 | 141040.000000 | 8.740000 | 0.600000 | 10.900000 | 1.080000 | 50 |
fig, axes = plt.subplots(1, 5, figsize=(20, 8))
fig.suptitle('Boxplot of features for each cluster', fontsize=20)
counter = 0
for i,column in enumerate(working_df.columns[:-1]):
sns.boxplot(ax=axes[i],y=working_df[column],x=working_df.kmeans_clusters, palette='coolwarm_r')
fig.tight_layout(pad=2.0)
# cophenet index is a measure of the correlation between the distance of points in feature space and distance on dendrogram
# closer it is to 1, the better is the clustering
# since there are outliers, it is good practice to check distance metrics other than euclidean which doesn't handle outliers as well as the others. Mahalanobis is esp used for outlier cases
distance_metrics = [ 'euclidean','chebyshev','mahalanobis','cityblock']
#The following linkage methods work for all metrics listed above
linkage_methods = ['single', 'complete', 'average', 'weighted']
corrs=[]  # NOTE(review): redundant - re-initialised at the top of every metric iteration below
corr_d = {}
for dm in distance_metrics:
    corrs=[]
    for lm in linkage_methods:
        Z = linkage(df_scaled, metric=dm, method=lm)
        c, coph_dists = cophenet(Z , pdist(df_scaled))  # c = cophenetic correlation coefficient
        corrs.append(c)
    corr_d[dm] = corrs
# rows = distance metrics, columns = linkage methods
corr_df = pd.DataFrame.from_dict(corr_d, orient='index', columns=linkage_methods)
max_method=corr_df.max().idxmax()        # column (linkage method) containing the global max
max_metric=corr_df[max_method].idxmax()  # row (distance metric) of the global max
max_corr = corr_df.loc[max_metric,max_method]
print("Cophenetic coeff for methods and metrics are:")
display(corr_df.style.apply(highlight_max, color='darkorange', axis=None))
print(f'\nMax value of coeff is {max_corr:.5f} which is for metric: {max_metric} and method: {max_method}')
Cophenetic coeff for methods and metrics are:
| single | complete | average | weighted | |
|---|---|---|---|---|
| euclidean | 0.739122 | 0.859973 | 0.897708 | 0.886175 |
| chebyshev | 0.738235 | 0.853347 | 0.897416 | 0.891362 |
| mahalanobis | 0.705806 | 0.666353 | 0.832699 | 0.780599 |
| cityblock | 0.725238 | 0.873148 | 0.896329 | 0.882552 |
Max value of coeff is 0.89771 which is for metric: euclidean and method: average
# Repeat the sweep for euclidean only, adding the linkage methods that are
# valid exclusively with the euclidean metric (median, ward, centroid)
linkage_methods = ['single', 'complete', 'average','median','ward', 'weighted', 'centroid']
corrs=[]
corr_d2 = {}
for lm in linkage_methods:
    Z = linkage(df_scaled, metric='euclidean', method=lm)
    c, coph_dists = cophenet(Z , pdist(df_scaled))  # cophenetic correlation
    corrs.append(c)
corr_d2['euclidean'] = corrs
corr_df2 = pd.DataFrame.from_dict(corr_d2, orient='index',columns=linkage_methods)
# Bugfix: the original looked up max_metric2/max_corr2 using the *previous*
# cell's max_method/max_metric variables; it only worked by coincidence
# because this table has a single metric row. Use this table's own argmaxes.
max_method2 = corr_df2.max().idxmax()
max_metric2 = corr_df2[max_method2].idxmax()
max_corr2 = corr_df2.loc[max_metric2, max_method2]
print("Cophenetic coeff for methods and metrics are:")
display(corr_df2.style.apply(highlight_max, color='darkorange', axis=None))
print(f'\nMax value of coeff is {max_corr2:.6f} which is for metric: {max_metric2} and method: {max_method2}')
Cophenetic coeff for methods and metrics are:
| single | complete | average | median | ward | weighted | centroid | |
|---|---|---|---|---|---|---|---|
| euclidean | 0.739122 | 0.859973 | 0.897708 | 0.889380 | 0.741516 | 0.886175 | 0.893939 |
Max value of coeff is 0.897708 which is for metric: euclidean and method: average
%%time
# Create lists to save results of coph calculation
compare_cols = ['Linkage', 'Cophenetic Coefficient']
compare = []
# Create a subplot image: one dendrogram per linkage method, stacked vertically
fig, axs = plt.subplots(len(linkage_methods), 1, figsize=(20, 40))
# Enumerate through the list of all methods above
# Get linkage, plot dendrogram, calculate cophenetic coefficient
for i, method in enumerate(linkage_methods):
    Z = linkage(df_scaled, metric='euclidean', method=method)
    dendrogram(Z, ax=axs[i]);
    axs[i].set_title(f'Dendrogram ({method.capitalize()} Linkage)')
    coph_corr, coph_dist = cophenet(Z, pdist(df_scaled))
    # Stamp the coefficient in the top-right corner of each dendrogram
    axs[i].annotate(f'Cophenetic\nCoefficient\n{coph_corr:0.2f}',
                    (0.80, 0.80),
                    xycoords='axes fraction')
    compare.append([method, coph_corr])
CPU times: user 6.21 s, sys: 255 ms, total: 6.46 s Wall time: 5.07 s
mod2 = AgglomerativeClusteringWithPredict(n_clusters=3,affinity='euclidean', linkage='average')
x=mod2.fit_predict(df_scaled)
y=AgglomerativeClustering(n_clusters=3,affinity='euclidean',linkage='average').fit_predict(df_scaled)
print(f'Making sure that predictions returned by my custom class are the same as those returned by the parent class: {sum(x==y)==working_df.shape[0]}')
#working_df['HC_Clusters'] = HCmodel.labels_
Making sure that predictions returned by my custom class are the same as those returned by the parent class: True
#Examining silhouette scores for various values of n_clusters
silscores = {}
for i in range(2,10):
HCmodel = AgglomerativeClusteringWithPredict(n_clusters=i, affinity='euclidean', linkage='average')
HCmodel.fit(df_scaled)
silscores[i] = silhouette_score(df_scaled,HCmodel.labels_)
print("silhouette scores for agglomerative clustering with various values of n_clusters are:")
silscores
silhouette scores for agglomerative clustering with various values of n_clusters are:
{2: 0.5703183487340514,
3: 0.515922432650965,
4: 0.47495143595793504,
5: 0.44039753024783956,
6: 0.4153547954831452,
7: 0.4183775674672025,
8: 0.34306710358280806,
9: 0.3415486932890892}
sildf=pd.DataFrame.from_dict(silscores, orient='index', columns=['Silhouette Scores'])#.plot(kind='bar')
zz=sns.barplot(data=sildf, x=sildf.index, y=sildf['Silhouette Scores'], palette='crest')
show_values_on_bars(zz, decimals=True)
plt.xlabel('k')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Scores for k Clusters (HC)', fontsize=20);
del(sildf)
clusters=range(2,9) # explore k = 2-8 for the HC elbow plot (also needed for visualisations below)
meanDistortions=[]
predictions = {}  # NOTE: intentionally overwrites the earlier KMeans predictions dict
for k in clusters:
    # declare model with k clusters
    model = AgglomerativeClusteringWithPredict(n_clusters=k, affinity='euclidean', linkage='average')
    # Bugfix: the original called model.fit() and then model.fit_predict(),
    # fitting the model twice; fit_predict alone fits and returns the labels
    # (and populates cluster_centers_ via the custom subclass).
    prediction = model.fit_predict(df_scaled)
    predictions[k] = prediction
    # Distortion = mean distance of each point to its own cluster centre:
    # cdist gives every row's distance to every centre, np.min picks the
    # distance to its own centre, and dividing by n rows gives the average.
    distortion = sum(np.min(cdist(df_scaled, model.cluster_centers_, 'euclidean'), axis=1)) / df_scaled.shape[0]
    meanDistortions.append(distortion)
    print(f'Number of Clusters: {k} \t Average Distortion: {distortion:.3f}')
Number of Clusters: 2 Average Distortion: 1.718 Number of Clusters: 3 Average Distortion: 1.147 Number of Clusters: 4 Average Distortion: 1.142 Number of Clusters: 5 Average Distortion: 1.138 Number of Clusters: 6 Average Distortion: 1.132 Number of Clusters: 7 Average Distortion: 1.113 Number of Clusters: 8 Average Distortion: 1.106
#Plotting elbow plot
sns.set_context("notebook", font_scale=1.2, rc={"lines.linewidth": 2.5})
sns.lineplot(x=clusters, y=meanDistortions)
plt.xlabel('k')
plt.ylabel('Average Distortion')
plt.title('Selecting k with the Elbow Method', fontsize=20);
#Alternative elbowplot visualisation using yellowbrick
visualizer = KElbowVisualizer(AgglomerativeClusteringWithPredict(n_clusters=None,affinity='euclidean', linkage='average'))
visualizer.fit(df_scaled) # Fit the data to the visualizer
visualizer.show();
#Adding cluster labels as a column in the df
HCmodel = AgglomerativeClusteringWithPredict(n_clusters=3, affinity='euclidean', linkage='average')
HCmodel.fit_predict(df_scaled)
working_df['hc_clusters'] = HCmodel.labels_
showClustersIn2D(HCmodel, df_scaled, title='HC Clusters and Centroids')
x=working_df.groupby('hc_clusters').mean().iloc[:,:5]
x['no_of_rows'] = working_df.groupby('hc_clusters')['Avg_Credit_Limit'].count().values
x
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | no_of_rows | |
|---|---|---|---|---|---|---|
| hc_clusters | ||||||
| 0 | 33713.178295 | 5.511628 | 3.485788 | 0.984496 | 2.005168 | 387 |
| 1 | 141040.000000 | 8.740000 | 0.600000 | 10.900000 | 1.080000 | 50 |
| 2 | 12197.309417 | 2.403587 | 0.928251 | 3.560538 | 6.883408 | 223 |
fig, axes = plt.subplots(1, 5, figsize=(20, 8))
fig.suptitle('Boxplot of features for each cluster', fontsize=20)
for i,column in enumerate(working_df.columns[:-2]):
sns.boxplot(ax=axes[i],y=working_df[column],x=working_df.hc_clusters, palette='coolwarm_r')
fig.tight_layout(pad=2.0)
#Lets relabel Hierarchical clusters to better compare to kmeans
#Comparing the graphs with the kmeans boxplots, it appears that the labels need to be right shifted by 1
#Code below for right shifting by 1 in the domain [0,1,2]
working_df['hc_clusters_relabel'] = working_df['hc_clusters'].map(lambda x: (x+1)%3)
# Count how many rows agree between the relabelled HC clusters and the kmeans clusters
(working_df.hc_clusters_relabel==working_df.kmeans_clusters).value_counts()
True 659 False 1 dtype: int64
working_df[working_df.hc_clusters_relabel!=working_df.kmeans_clusters]
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | kmeans_clusters | hc_clusters | hc_clusters_relabel | |
|---|---|---|---|---|---|---|---|---|
| 313 | 7000 | 4 | 2 | 2 | 4 | 0 | 0 | 1 |
x=working_df.groupby('kmeans_clusters').mean().iloc[:,:5]
x['no_of_rows'] = working_df.groupby('kmeans_clusters')['Avg_Credit_Limit'].count().values
y=working_df.groupby('hc_clusters_relabel').mean().iloc[:,:5]
y['no_of_rows'] = working_df.groupby('hc_clusters_relabel')['Avg_Credit_Limit'].count().values
display(x)
display(y)
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | no_of_rows | |
|---|---|---|---|---|---|---|
| kmeans_clusters | ||||||
| 0 | 12174.107143 | 2.410714 | 0.933036 | 3.553571 | 6.870536 | 224 |
| 1 | 33782.383420 | 5.515544 | 3.489637 | 0.981865 | 2.000000 | 386 |
| 2 | 141040.000000 | 8.740000 | 0.600000 | 10.900000 | 1.080000 | 50 |
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | no_of_rows | |
|---|---|---|---|---|---|---|
| hc_clusters_relabel | ||||||
| 0 | 12197.309417 | 2.403587 | 0.928251 | 3.560538 | 6.883408 | 223 |
| 1 | 33713.178295 | 5.511628 | 3.485788 | 0.984496 | 2.005168 | 387 |
| 2 | 141040.000000 | 8.740000 | 0.600000 | 10.900000 | 1.080000 | 50 |
fig, axes = plt.subplots(2, 5, figsize=(20, 8))
fig.suptitle('Boxplot (top-HC, bottom-Kmeans)', fontsize=20)
counter = 0
for i,column in enumerate(working_df.columns[:-3]):
sns.boxplot(ax=axes[0,i],y=working_df[column],x=working_df.hc_clusters_relabel, palette='coolwarm_r')
sns.boxplot(ax=axes[1,i],y=working_df[column],x=working_df.kmeans_clusters, palette='coolwarm_r')
fig.tight_layout(pad=2.0)
fig, axes = plt.subplots(1,2, figsize=(20,10))
showClustersIn2D(KMeans(n_clusters=3,random_state=random_state), df_scaled, ax=axes[0], centroid_size=100, point_size=40)
showClustersIn2D(AgglomerativeClusteringWithPredict(n_clusters=3, affinity='euclidean', linkage='average'), df_scaled, title="Left-Kmeans, Right-HC", ax=axes[1], centroid_size=100, point_size=40)
#note that have not relabelled HC clusters as with the boxplot above (not needed as the color codes are clear)
Cluster 0: (224 rows, 34%)
Cluster 1: (386 rows, 58%)
Cluster 2: (50 rows, 8%)
The clusters seem quite distinct and it appears the model works well in segregating the dataset into unique and distinct groups with their own behaviour characteristics
Based on the above, the bank would be well served to tune the delivery channels (physical, online, call) to the needs of each cluster - customers can be grouped by credit limit and cards
Cluster 0 make connections with the bank in the ratio 7:3.5:1 (phone: online: physical) ie the probability that they will choose to connect by phone is 60% and 31% by online.
Cluster 1 make connections in the ratio 2:1:3.5 (phone: online: physical) ie the probability that they will choose to connect physically is the highest (54%), followed by phone calls (31%)
Cluster 2 make connections in the ratio 1:11:.6 (phone: online: physical) ie the probability that they will choose to connect online is 87%
Z = linkage(df_scaled, 'average', metric='euclidean') #since cophenetic analysis showed this to be the best combination
Z.shape
(659, 4)
plt.figure(figsize=(25, 10))
dendrogram(Z)
plt.show()
max_d = 40 # based on full denogram around this distance there are 3 clear clusters
# Using truncate_mode='lastp' attribute in dendrogram function to arrive at dendrogram
plt.figure(figsize=(10,5))
dendrogram(
Z,
truncate_mode='lastp', # show only the last p merged clusters
p=3, # show only the last p merged clusters
)
plt.show()
# Pulling out flat cluster labels from the linkage matrix
from scipy.cluster.hierarchy import fcluster

# BUG FIX: cutting at max_d=40 with criterion='distance' put every one of the
# 660 rows into a single cluster (all labels came back as 1) — the merge
# heights in this average-linkage tree are all well below 40, so the threshold
# never split the data. Request exactly the 3 clusters visible in the
# dendrogram instead of guessing a distance.
clusters = fcluster(Z, 3, criterion='maxclust')
clusters
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
dtype=int32)
# Assigning the flat cluster labels from fcluster to a new column of the working frame
working_df['alt_hc_labels'] = clusters
working_df
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | kmeans_clusters | hc_clusters | hc_clusters_relabel | alt_hc_labels | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 100000 | 2 | 1 | 1 | 0 | 1 | 0 | 1 | 1 |
| 1 | 50000 | 3 | 0 | 10 | 9 | 0 | 2 | 0 | 1 |
| 2 | 50000 | 7 | 1 | 3 | 4 | 1 | 0 | 1 | 1 |
| 3 | 30000 | 5 | 1 | 1 | 4 | 1 | 0 | 1 | 1 |
| 4 | 100000 | 6 | 0 | 12 | 3 | 2 | 1 | 2 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 655 | 99000 | 10 | 1 | 10 | 0 | 2 | 1 | 2 | 1 |
| 656 | 84000 | 10 | 1 | 13 | 2 | 2 | 1 | 2 | 1 |
| 657 | 145000 | 8 | 1 | 9 | 1 | 2 | 1 | 2 | 1 |
| 658 | 172000 | 10 | 1 | 15 | 0 | 2 | 1 | 2 | 1 |
| 659 | 167000 | 9 | 0 | 12 | 2 | 2 | 1 | 2 | 1 |
660 rows × 9 columns
# Row counts per cluster: compare the fcluster cut against the earlier HC labels
for label_col in ('alt_hc_labels', 'hc_clusters'):
    display(working_df.groupby(label_col).count())
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | kmeans_clusters | hc_clusters | hc_clusters_relabel | |
|---|---|---|---|---|---|---|---|---|
| alt_hc_labels | ||||||||
| 1 | 660 | 660 | 660 | 660 | 660 | 660 | 660 | 660 |
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | kmeans_clusters | hc_clusters_relabel | alt_hc_labels | |
|---|---|---|---|---|---|---|---|---|
| hc_clusters | ||||||||
| 0 | 387 | 387 | 387 | 387 | 387 | 387 | 387 | 387 |
| 1 | 50 | 50 | 50 | 50 | 50 | 50 | 50 | 50 |
| 2 | 223 | 223 | 223 | 223 | 223 | 223 | 223 | 223 |
# Per-cluster feature means: compare the fcluster cut against the earlier HC labels
for label_col in ('alt_hc_labels', 'hc_clusters'):
    display(working_df.groupby(label_col).mean())
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | kmeans_clusters | hc_clusters | hc_clusters_relabel | |
|---|---|---|---|---|---|---|---|---|
| alt_hc_labels | ||||||||
| 1 | 34574.242424 | 4.706061 | 2.40303 | 2.606061 | 3.583333 | 0.736364 | 0.751515 | 0.737879 |
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | kmeans_clusters | hc_clusters_relabel | alt_hc_labels | |
|---|---|---|---|---|---|---|---|---|
| hc_clusters | ||||||||
| 0 | 33713.178295 | 5.511628 | 3.485788 | 0.984496 | 2.005168 | 0.997416 | 1.0 | 1.0 |
| 1 | 141040.000000 | 8.740000 | 0.600000 | 10.900000 | 1.080000 | 2.000000 | 2.0 | 1.0 |
| 2 | 12197.309417 | 2.403587 | 0.928251 | 3.560538 | 6.883408 | 0.000000 | 0.0 | 1.0 |
# Fit a 3-cluster agglomerative model for each linkage method and keep its labels.
# NOTE(review): sklearn renamed AgglomerativeClustering's 'affinity' kwarg to
# 'metric' in 1.2 (removed in 1.4) — confirm the wrapper still accepts 'affinity'.
linkage_methods = ['ward', 'complete', 'average', 'single']
for method in linkage_methods:
    model = AgglomerativeClusteringWithPredict(n_clusters=3, affinity='euclidean', linkage=method)
    model.fit_predict(df_scaled)
    working_df[f'hc_{method}'] = model.labels_
# For each linkage: per-cluster means of the five raw features plus cluster sizes.
for method in linkage_methods:
    summary = working_df.groupby(f'hc_{method}').mean().iloc[:, :5]
    summary['no_of_rows'] = working_df.groupby(f'hc_{method}')['Avg_Credit_Limit'].count().values
    display(summary)
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | no_of_rows | |
|---|---|---|---|---|---|---|
| hc_ward | ||||||
| 0 | 33851.948052 | 5.516883 | 3.493506 | 0.979221 | 1.994805 | 385 |
| 1 | 12151.111111 | 2.422222 | 0.937778 | 3.546667 | 6.857778 | 225 |
| 2 | 141040.000000 | 8.740000 | 0.600000 | 10.900000 | 1.080000 | 50 |
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | no_of_rows | |
|---|---|---|---|---|---|---|
| hc_complete | ||||||
| 0 | 33151.133501 | 5.460957 | 3.405542 | 1.010076 | 2.060453 | 397 |
| 1 | 141040.000000 | 8.740000 | 0.600000 | 10.900000 | 1.080000 | 50 |
| 2 | 12234.741784 | 2.352113 | 0.957746 | 3.633803 | 7.009390 | 213 |
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | no_of_rows | |
|---|---|---|---|---|---|---|
| hc_average | ||||||
| 0 | 33713.178295 | 5.511628 | 3.485788 | 0.984496 | 2.005168 | 387 |
| 1 | 141040.000000 | 8.740000 | 0.600000 | 10.900000 | 1.080000 | 50 |
| 2 | 12197.309417 | 2.403587 | 0.928251 | 3.560538 | 6.883408 | 223 |
| Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | no_of_rows | |
|---|---|---|---|---|---|---|
| hc_single | ||||||
| 0 | 25807.881773 | 4.377668 | 2.555008 | 1.912972 | 3.779967 | 609 |
| 1 | 50000.000000 | 3.000000 | 0.000000 | 10.000000 | 9.000000 | 1 |
| 2 | 141040.000000 | 8.740000 | 0.600000 | 10.900000 | 1.080000 | 50 |